import pandas as pd
# Load the three joke datasets and stack their joke columns into one frame.
df1 = pd.read_csv('data/dad-a-base.csv')
df2 = pd.read_csv('data/reddit_dadjokes.csv')
df3 = pd.read_csv('data/shortjokes.csv')
# The source files name the column inconsistently ('Joke' vs 'joke'), so name
# the resulting column explicitly rather than relying on the concatenated
# Series being unnamed when it is handed to the DataFrame constructor.
df = pd.concat(
    [df1['Joke'], df2['joke'], df3['Joke']],
    ignore_index=True,
).to_frame(name='joke')

# Define keywords related to data science (each keyword listed once)
keywords = [
    'data', 'algorithm', 'computer', 'code', 'math', 'statistics', 'machine',
    'analytics', 'python', 'visualization', 'engineer', 'scientist', 'nlp',
    'gpt',
]

# Filter jokes based on keywords. This is a case-insensitive *substring*
# match, so e.g. 'code' also matches 'decode'; missing jokes (NaN) are
# excluded via na=False.
# NOTE(review): the original comment recorded 441727 -> 7516 rows, but the
# describe() output further down reports a count of 7609 -- confirm which
# figure is current.
df_filtered = df[df['joke'].str.contains('|'.join(keywords), case=False, na=False)]
df_filtered.head()
| | joke |
|---|---|
| 44 | It was so cold yesterday my computer froze. My... |
| 209 | What do computers and air conditioners have in... |
| 211 | Scientists finally did a study on forks. It's ... |
| 302 | Did you hear about the scientist who was lab p... |
| 363 | There are two types of people in this world, t... |
"""
Cell generated by Data Wrangler.
"""
def clean_data(df_filtered: pd.DataFrame) -> pd.DataFrame:
    """Drop rows that are exact duplicates of an earlier row.

    The frame is modified in place and the same object is returned, so
    callers that want to keep the original intact should pass a copy.

    Args:
        df_filtered: Frame to de-duplicate; all columns are compared.

    Returns:
        The same ``df_filtered`` object, with only the first occurrence of
        each duplicated row kept (original index labels are preserved).
    """
    # Drop duplicate rows across all columns
    df_filtered.drop_duplicates(inplace=True)
    return df_filtered
# De-duplicate a deep copy so that df_filtered itself is left unchanged.
df_filtered_clean = clean_data(df_filtered.copy(deep=True))
df_filtered_clean.head(n=5)
| | joke |
|---|---|
| 44 | It was so cold yesterday my computer froze. My... |
| 209 | What do computers and air conditioners have in... |
| 211 | Scientists finally did a study on forks. It's ... |
| 302 | Did you hear about the scientist who was lab p... |
| 363 | There are two types of people in this world, t... |
# Summarise the filtered frame *before* de-duplication; a `count` larger than
# `unique` in the output means some jokes occur more than once.
df_filtered.describe()
| | joke |
|---|---|
| count | 7609 |
| unique | 7518 |
| top | Did you hear about the constipated mathematici... |
| freq | 6 |
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Generate the word cloud from the de-duplicated jokes: df_filtered still
# contains repeated jokes (describe() reports fewer unique values than rows),
# and every repeat would otherwise inflate the frequency of its words.
wordcloud = WordCloud(width=800, height=800, background_color='white').generate(
    ' '.join(df_filtered_clean['joke'])
)
# Display the word cloud
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis('off')  # the axes carry no meaning for a word cloud image
plt.show()
import plotly.express as px
from collections import defaultdict
import re

# Maximum number of jokes shown in a single hover label; without a cap a
# common word would produce a tooltip containing every joke that uses it.
hover_joke_limit = 5

# WordCloud reports cleaned-up tokens (punctuation removed, possibly two-word
# phrases), so raw ``str.split`` tokens such as "computer," or "Computer"
# would never match its keys. Tokenise on word characters and compare
# case-insensitively instead. NOTE(review): this only approximates
# WordCloud's own normalisation (e.g. it does not merge plurals).
token_pattern = re.compile(r"\w[\w']*")


def _normalize_token(token):
    """Lower-case a token and strip a trailing possessive ``'s``."""
    token = token.lower()
    return token[:-2] if token.endswith("'s") else token


def _hover_text(term):
    """Return up to ``hover_joke_limit`` jokes containing every word of ``term``.

    Jokes are joined with ``<br>`` because Plotly hover labels render that
    tag as a line break, whereas a plain newline is not shown as one.
    """
    parts = [_normalize_token(part) for part in term.split()]
    if not parts:
        return ''
    # .get() avoids inserting empty entries into the defaultdict on a miss.
    matches = word_to_jokes.get(parts[0], [])
    for part in parts[1:]:
        containing = set(word_to_jokes.get(part, ()))
        matches = [joke for joke in matches if joke in containing]
    return '<br>'.join(matches[:hover_joke_limit])


# Create a dictionary to store normalised words and the distinct jokes that
# contain them
word_to_jokes = defaultdict(list)
for joke in df_filtered['joke'].drop_duplicates():
    # A set, so a joke that repeats a word is listed only once for that word.
    for word in {_normalize_token(tok) for tok in token_pattern.findall(joke)}:
        word_to_jokes[word].append(joke)

# Create a DataFrame for plotting (word, relative frequency, hover text)
plot_data = pd.DataFrame(list(wordcloud.words_.items()), columns=['word', 'freq'])
plot_data['joke'] = plot_data['word'].map(_hover_text)

# Create an interactive scatter plot (word cloud) for top 10 frequent words
top_10 = plot_data.nlargest(10, 'freq')
fig = px.scatter(top_10, x='word', y='freq', text='word', size='freq', hover_data=['joke'])
fig.update_traces(textposition='top center', textfont_size=14)
fig.update_layout(showlegend=False, xaxis_visible=False, yaxis_visible=False)
fig.show()